# Libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
# Import the data subsets
RHP_RHH_df <- read_csv("RHP_RHH_bip.csv")
## Rows: 144820 Columns: 42
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): gameid, pitcher, pitcherthrows, batter, batterside, pitchresult, ...
## dbl  (26): ab, pitchnum, inning, teambat, balls, strikes, outs, visscore, ho...
## time  (1): GameDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
RHP_LHH_df <- read_csv("RHP_LHH_bip.csv")
## Rows: 123442 Columns: 42
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): gameid, pitcher, pitcherthrows, batter, batterside, pitchresult, ...
## dbl  (26): ab, pitchnum, inning, teambat, balls, strikes, outs, visscore, ho...
## time  (1): GameDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
LHP_RHH_df <- read_csv("LHP_RHH_bip.csv")
## Rows: 72971 Columns: 42
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): gameid, pitcher, pitcherthrows, batter, batterside, pitchresult, ...
## dbl  (26): ab, pitchnum, inning, teambat, balls, strikes, outs, visscore, ho...
## time  (1): GameDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
LHP_LHH_df <- read_csv("LHP_LHH_bip.csv")
## Rows: 26371 Columns: 42
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): gameid, pitcher, pitcherthrows, batter, batterside, pitchresult, ...
## dbl  (26): ab, pitchnum, inning, teambat, balls, strikes, outs, visscore, ho...
## time  (1): GameDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
GAM_RR <- gam(GIDP_SweetSpot ~
           s(spinrate) +
           s(relspeed) +
           s(inducedvertbreak) +
           s(horzbreak) +
           ti(platelocside, platelocheight) +
           s(platelocside) + s(platelocheight) +
           ti(initposx, initposz) +
           s(initposx) + s(initposz), 
         data=RHP_RHH_df,
         family = binomial,
         method = "REML")

summary(GAM_RR)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## GIDP_SweetSpot ~ s(spinrate) + s(relspeed) + s(inducedvertbreak) + 
##     s(horzbreak) + ti(platelocside, platelocheight) + s(platelocside) + 
##     s(platelocheight) + ti(initposx, initposz) + s(initposx) + 
##     s(initposz)
## 
## Parametric coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.655941   0.008127  -203.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df   Chi.sq p-value    
## s(spinrate)                     3.568  4.577   15.083 0.00728 ** 
## s(relspeed)                     4.256  5.362  245.481 < 2e-16 ***
## s(inducedvertbreak)             6.208  7.404  798.282 < 2e-16 ***
## s(horzbreak)                    4.645  5.784  195.252 < 2e-16 ***
## ti(platelocside,platelocheight) 8.861 10.144  161.537 < 2e-16 ***
## s(platelocside)                 7.654  8.543  385.655 < 2e-16 ***
## s(platelocheight)               3.436  4.373 1002.230 < 2e-16 ***
## ti(initposx,initposz)           2.349  2.862   10.512 0.01578 *  
## s(initposx)                     1.159  1.302    1.152 0.42889    
## s(initposz)                     1.116  1.211   32.486 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0274   Deviance explained = 3.15%
## -REML =  64042  Scale est. = 1         n = 144820
GAM_RL <- gam(GIDP_SweetSpot ~
           s(spinrate) +
           s(relspeed) +
           s(inducedvertbreak) +
           s(horzbreak) +
           ti(platelocside, platelocheight) +
           s(platelocside) + s(platelocheight) +
           ti(initposx, initposz) +
           s(initposx) + s(initposz), 
         data=RHP_LHH_df,
         family = binomial,
         method = "REML")

summary(GAM_RL)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## GIDP_SweetSpot ~ s(spinrate) + s(relspeed) + s(inducedvertbreak) + 
##     s(horzbreak) + ti(platelocside, platelocheight) + s(platelocside) + 
##     s(platelocheight) + ti(initposx, initposz) + s(initposx) + 
##     s(initposz)
## 
## Parametric coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.795205   0.009352    -192   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df  Chi.sq  p-value    
## s(spinrate)                     5.244  6.489  60.653  < 2e-16 ***
## s(relspeed)                     5.667  6.897 540.268  < 2e-16 ***
## s(inducedvertbreak)             5.626  6.831 589.107  < 2e-16 ***
## s(horzbreak)                    7.674  8.575 152.750  < 2e-16 ***
## ti(platelocside,platelocheight) 6.896  8.348  60.948  < 2e-16 ***
## s(platelocside)                 6.087  7.290 880.565  < 2e-16 ***
## s(platelocheight)               3.751  4.737 482.716  < 2e-16 ***
## ti(initposx,initposz)           5.219  6.476  29.635 5.62e-05 ***
## s(initposx)                     1.904  2.416  40.836  < 2e-16 ***
## s(initposz)                     1.837  2.330   7.551   0.0469 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0256   Deviance explained = 3.09%
## -REML =  50753  Scale est. = 1         n = 123442
GAM_LR <- gam(GIDP_SweetSpot ~
           s(spinrate) +
           s(relspeed) +
           s(inducedvertbreak) +
           s(horzbreak) +
           ti(platelocside, platelocheight) +
           s(platelocside) + s(platelocheight) +
           ti(initposx, initposz) +
           s(initposx) + s(initposz), 
         data=LHP_RHH_df,
         family = binomial,
         method = "REML")

summary(GAM_LR)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## GIDP_SweetSpot ~ s(spinrate) + s(relspeed) + s(inducedvertbreak) + 
##     s(horzbreak) + ti(platelocside, platelocheight) + s(platelocside) + 
##     s(platelocheight) + ti(initposx, initposz) + s(initposx) + 
##     s(initposz)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.74453    0.01163    -150   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df  Chi.sq p-value    
## s(spinrate)                     3.927  4.990  47.055 < 2e-16 ***
## s(relspeed)                     4.083  5.143 199.055 < 2e-16 ***
## s(inducedvertbreak)             5.274  6.384 312.427 < 2e-16 ***
## s(horzbreak)                    7.191  8.299  31.473 0.00014 ***
## ti(platelocside,platelocheight) 7.417  8.789  55.343 < 2e-16 ***
## s(platelocside)                 6.552  7.715 274.461 < 2e-16 ***
## s(platelocheight)               2.931  3.732 314.141 < 2e-16 ***
## ti(initposx,initposz)           1.746  2.107   1.269 0.50887    
## s(initposx)                     1.003  1.007   7.049 0.00807 ** 
## s(initposz)                     1.851  2.346   4.213 0.12602    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0226   Deviance explained =  2.7%
## -REML =  30984  Scale est. = 1         n = 72971
GAM_LL <- gam(GIDP_SweetSpot ~
           s(spinrate) +
           s(relspeed) +
           s(inducedvertbreak) +
           s(horzbreak) +
           ti(platelocside, platelocheight) +
           s(platelocside) + s(platelocheight) +
           ti(initposx, initposz) +
           s(initposx) + s(initposz), 
         data=LHP_LHH_df,
         family = binomial,
         method = "REML")

summary(GAM_LL)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## GIDP_SweetSpot ~ s(spinrate) + s(relspeed) + s(inducedvertbreak) + 
##     s(horzbreak) + ti(platelocside, platelocheight) + s(platelocside) + 
##     s(platelocheight) + ti(initposx, initposz) + s(initposx) + 
##     s(initposz)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.6759     0.0193  -86.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df  Chi.sq  p-value    
## s(spinrate)                     1.006  1.012   1.236    0.268    
## s(relspeed)                     3.174  4.070  33.464 6.82e-07 ***
## s(inducedvertbreak)             5.494  6.678 161.309  < 2e-16 ***
## s(horzbreak)                    5.452  6.660  69.627  < 2e-16 ***
## ti(platelocside,platelocheight) 5.045  6.260  37.916  < 2e-16 ***
## s(platelocside)                 2.823  3.602  83.465  < 2e-16 ***
## s(platelocheight)               1.002  1.004 106.452  < 2e-16 ***
## ti(initposx,initposz)           1.005  1.010   0.074    0.797    
## s(initposx)                     1.002  1.004   1.322    0.251    
## s(initposz)                     1.002  1.003  19.029 1.27e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.024   Deviance explained = 2.95%
## -REML =  11464  Scale est. = 1         n = 26371
GAM_RR$sp
##                      s(spinrate)                      s(relspeed) 
##                     7.758323e+00                     2.268960e+00 
##              s(inducedvertbreak)                     s(horzbreak) 
##                     1.036654e+00                     3.942775e+00 
## ti(platelocside,platelocheight)1 ti(platelocside,platelocheight)2 
##                     1.300854e-01                     6.901210e+00 
##                  s(platelocside)                s(platelocheight) 
##                     2.422814e-01                     1.213659e+01 
##           ti(initposx,initposz)1           ti(initposx,initposz)2 
##                     1.778650e+04                     4.819995e+01 
##                      s(initposx)                      s(initposz) 
##                     6.083816e+02                     2.060975e+02
GAM_RL$sp
##                      s(spinrate)                      s(relspeed) 
##                        1.4862810                        0.8882628 
##              s(inducedvertbreak)                     s(horzbreak) 
##                        1.2006682                        0.1771016 
## ti(platelocside,platelocheight)1 ti(platelocside,platelocheight)2 
##                        3.1108314                        4.4345738 
##                  s(platelocside)                s(platelocheight) 
##                        1.0184365                        7.0181416 
##           ti(initposx,initposz)1           ti(initposx,initposz)2 
##                        6.9994421                        7.7315587 
##                      s(initposx)                      s(initposz) 
##                       35.3366361                        9.9384377
GAM_LR$sp
##                      s(spinrate)                      s(relspeed) 
##                     3.543508e+00                     1.390835e+00 
##              s(inducedvertbreak)                     s(horzbreak) 
##                     1.168677e+00                     8.071295e-02 
## ti(platelocside,platelocheight)1 ti(platelocside,platelocheight)2 
##                     3.673251e-02                     9.945469e+00 
##                  s(platelocside)                s(platelocheight) 
##                     3.034135e-01                     1.432097e+01 
##           ti(initposx,initposz)1           ti(initposx,initposz)2 
##                     2.426045e+05                     6.579429e+01 
##                      s(initposx)                      s(initposz) 
##                     1.810714e+04                     2.282283e+01
GAM_LL$sp
##                      s(spinrate)                      s(relspeed) 
##                     2.322189e+03                     2.485375e+00 
##              s(inducedvertbreak)                     s(horzbreak) 
##                     1.289867e-01                     2.700248e-01 
## ti(platelocside,platelocheight)1 ti(platelocside,platelocheight)2 
##                     3.448309e-02                     1.657037e+01 
##                  s(platelocside)                s(platelocheight) 
##                     3.630099e+00                     7.839531e+03 
##           ti(initposx,initposz)1           ti(initposx,initposz)2 
##                     4.186264e+03                     1.733346e+05 
##                      s(initposx)                      s(initposz) 
##                     1.581901e+04                     8.643324e+03
library(ggplot2)
library(RColorBrewer)
library(rlang)
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice

Visualizations

library(gratia)
library(ggplot2)
library(RColorBrewer)
library(rlang)
draw(GAM_LL, select = "s(relspeed)", residuals = TRUE)

draw(GAM_LL, select = "s(spinrate)", residuals = TRUE)

draw(GAM_LL, select = "s(inducedvertbreak)", residuals = TRUE)

draw(GAM_LL, select = "s(horzbreak)", residuals = TRUE)

Pitch Type Analysis

Each pitch is assigned to a cluster (0-4) based on physical pitch traits

RHP_RHH_by_cluster <- read_csv('RHP_RHH_GIDP_ByPitchCluster.csv')
## Rows: 144820 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): gameid, pitcher, pitcherthrows, batter, batterside, pitchresult, ...
## dbl  (27): ab, pitchnum, inning, teambat, balls, strikes, outs, visscore, ho...
## time  (1): GameDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
RHP_LHH_by_cluster <- read_csv('RHP_LHH_GIDP_ByPitchCluster.csv')
## Rows: 123442 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): gameid, pitcher, pitcherthrows, batter, batterside, pitchresult, ...
## dbl  (27): ab, pitchnum, inning, teambat, balls, strikes, outs, visscore, ho...
## time  (1): GameDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
LHP_RHH_by_cluster <- read_csv('LHP_RHH_GIDP_ByPitchCluster.csv')
## Rows: 72971 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): gameid, pitcher, pitcherthrows, batter, batterside, pitchresult, ...
## dbl  (27): ab, pitchnum, inning, teambat, balls, strikes, outs, visscore, ho...
## time  (1): GameDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
LHP_LHH_by_cluster <- read_csv('LHP_LHH_GIDP_ByPitchCluster.csv')
## Rows: 26371 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): gameid, pitcher, pitcherthrows, batter, batterside, pitchresult, ...
## dbl  (27): ab, pitchnum, inning, teambat, balls, strikes, outs, visscore, ho...
## time  (1): GameDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Inference by pitch type + location

Doing Zone-based visualizations

plot_zone_per_pitchtype_RHP_rev <- function(model, data, 
                                            pitch_cluster,
                                            pitch_type = NULL,
                                            title = NULL) {
  # Strike zone bounds
  k_zone_height_max <- 3.67
  k_zone_height_min <- 1.52
  sides <- c(-0.83, 0.83)
  
  # Define zone width/height and breakpoints
  zone_width  <- diff(sides)
  zone_height <- k_zone_height_max - k_zone_height_min
  x_breaks <- seq(sides[1] - zone_width/3,
                  sides[2] + zone_width/3,
                  length.out = 6)
  y_breaks <- seq(k_zone_height_min - zone_height/3,
                  k_zone_height_max + zone_height/3,
                  length.out = 6)
  
  # Cluster means for fixed covariates
  cluster_summary_rr <- data %>%
    filter(Cluster == pitch_cluster) %>%
    summarise(across(c(spinrate, relspeed, inducedvertbreak,
                       horzbreak, initposx, initposz), ~mean(.x, na.rm=TRUE)))
  
  # Prediction grid
  grid <- expand.grid(
    platelocside   = seq(sides[1] - 0.5, sides[2] + 0.5, length.out = 300),
    platelocheight = seq(k_zone_height_min - 0.5, k_zone_height_max + 0.5, length.out = 300)
  )
  for (col in names(cluster_summary_rr)) grid[[col]] <- cluster_summary_rr[[col]]
  grid$Cluster <- pitch_cluster
  
  # Predict probabilities
  grid$pred_prob <- predict(model, newdata = grid, type = "response")
  
  # Average within bins and compute boundaries
  zone_summary <- grid %>%
    mutate(
      x_bin = cut(platelocside, breaks = x_breaks, include.lowest = TRUE),
      y_bin = cut(platelocheight, breaks = y_breaks, include.lowest = TRUE)
    ) %>%
    group_by(x_bin, y_bin) %>%
    summarise(
      mean_prob = mean(pred_prob, na.rm = TRUE),
      x_min = min(platelocside),
      x_max = max(platelocside),
      y_min = min(platelocheight),
      y_max = max(platelocheight),
      .groups = "drop"
    ) %>%
    mutate(
      x_center = (x_min + x_max)/2,
      y_center = (y_min + y_max)/2
    )
  
  # ---- Plot using real coordinates (rectangles) ----
  p <- ggplot(zone_summary) +
    geom_rect(
      aes(xmin = x_min, xmax = x_max,
          ymin = y_min, ymax = y_max,
          fill = mean_prob),
      color = "white", linewidth = 0.8
    ) +
    geom_text(aes(x = x_center, y = y_center,
                  label = sprintf("%.2f", mean_prob)),
              color = "white", size = 3.8) +
    # True strike-zone outline
    geom_rect(aes(xmin = sides[1], xmax = sides[2],
                  ymin = k_zone_height_min, ymax = k_zone_height_max),
              color = "black", fill = NA, linewidth = 1.3) +
    scale_fill_viridis_c(option = "C", direction = -1, limits = c(0, 0.5)) +
    coord_equal() +
    labs(
      title = paste0(title, " — Pitch Profile: ", pitch_type,
                     " (Cluster ", pitch_cluster, ")"),
      x = "Horizontal Location (ft, Catcher View)",
      y = "Vertical Location (ft)",
      fill = "Pred. Prob."
    ) +
    theme_minimal(base_size = 14) +
    theme(
      panel.grid = element_blank(),
      plot.title = element_text(size = 13, hjust = 0.5)
    )
  
  return(p)
}

plot_zone_per_pitchtype_RHP_rev(GAM_RR, RHP_RHH_by_cluster, 0, 
                           "Slider/Cutter","RHP vs RHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_RR, RHP_RHH_by_cluster, 1, 
                           "4-Seam Fastball","RHP vs RHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_RR, RHP_RHH_by_cluster, 2, 
                           "Splitter/Changeup","RHP vs RHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_RR, RHP_RHH_by_cluster, 3, 
                           "Sinker/2-Seamer","RHP vs RHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_RR, RHP_RHH_by_cluster, 4, 
                           "Curveball/Vertical Dropper","RHP vs RHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_RL, RHP_LHH_by_cluster, 0, 
                           "Slider/Cutter","RHP vs LHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_RL, RHP_LHH_by_cluster, 1, 
                           "4-Seam Fastball","RHP vs LHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_RL, RHP_LHH_by_cluster, 2, 
                           "Splitter/Changeup","RHP vs LHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_RL, RHP_LHH_by_cluster, 3, 
                           "Sinker/2-Seamer","RHP vs LHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_RL, RHP_LHH_by_cluster, 4, 
                           "Curveball/Vertical Dropper","RHP vs LHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_LR, LHP_RHH_by_cluster, 0, 
                            "Curveball/Vertical Dropper", "LHP vs RHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_LR, LHP_RHH_by_cluster, 1, 
                            "4-Seam Fastball", "LHP vs RHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_LR, LHP_RHH_by_cluster, 2, 
                            "Slider/Cutter", "LHP vs RHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_LR, LHP_RHH_by_cluster, 3, 
                            "Splitter/Changeup", "LHP vs RHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_LR, LHP_RHH_by_cluster, 4, 
                            "Sinker/2-Seamer", "LHP vs RHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_LL, LHP_LHH_by_cluster, 0, 
                            "Curveball/Vertical Dropper", "LHP vs LHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_LL, LHP_LHH_by_cluster, 1, 
                            "4-Seam Fastball", "LHP vs LHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_LL, LHP_LHH_by_cluster, 2, 
                            "Slider/Cutter", "LHP vs LHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_LL, LHP_LHH_by_cluster, 3, 
                            "Splitter/Changeup", "LHP vs LHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP_rev(GAM_LL, LHP_LHH_by_cluster, 4, 
                            "Sinker/2-Seamer", "LHP vs LHH")
## Warning in geom_rect(aes(xmin = sides[1], xmax = sides[2], ymin = k_zone_height_min, : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

# RR
RHP_RHH_by_cluster$Prediction <- predict(GAM_RR, 
                                         newdata=RHP_RHH_by_cluster,
                                         type="response")
pitch_types_ranked_RR <- RHP_RHH_by_cluster %>%
  group_by(Cluster) %>%
  summarise(
    mean_pred_prob = mean(Prediction, na.rm=TRUE),
    se_pred_prob = sd(Prediction, na.rm=TRUE) / sqrt(n()),
    n = n()
  )  %>%
  arrange(desc(mean_pred_prob)) %>%
  mutate(Rank = row_number())
pitch_types_ranked_RR
## # A tibble: 5 × 5
##   Cluster mean_pred_prob se_pred_prob     n  Rank
##     <dbl>          <dbl>        <dbl> <int> <int>
## 1       3          0.207     0.000320 34464     1
## 2       2          0.191     0.000728  8113     2
## 3       0          0.179     0.000291 38163     3
## 4       4          0.168     0.000421 22204     4
## 5       1          0.129     0.000216 41876     5
# RL
RHP_LHH_by_cluster$Prediction <- predict(GAM_RL, 
                                         newdata=RHP_LHH_by_cluster,
                                         type="response")
pitch_types_ranked_RL <- RHP_LHH_by_cluster %>%
  group_by(Cluster) %>%
  summarise(
    mean_pred_prob = mean(Prediction, na.rm=TRUE),
    se_pred_prob = sd(Prediction, na.rm=TRUE) / sqrt(n()),
    n = n()
  )  %>%
  arrange(desc(mean_pred_prob)) %>%
  mutate(Rank = row_number())
pitch_types_ranked_RL
## # A tibble: 5 × 5
##   Cluster mean_pred_prob se_pred_prob     n  Rank
##     <dbl>          <dbl>        <dbl> <int> <int>
## 1       2          0.188     0.000426 19571     1
## 2       3          0.180     0.000366 24419     2
## 3       4          0.159     0.000458 15520     3
## 4       0          0.138     0.000335 20988     4
## 5       1          0.120     0.000198 42944     5
# LR
LHP_RHH_by_cluster$Prediction <- predict(GAM_LR, 
                                         newdata=LHP_RHH_by_cluster,
                                         type="response")
pitch_types_ranked_LR <- LHP_RHH_by_cluster %>%
  group_by(Cluster) %>%
  summarise(
    mean_pred_prob = mean(Prediction, na.rm=TRUE),
    se_pred_prob = sd(Prediction, na.rm=TRUE) / sqrt(n()),
    n = n()
  )  %>%
  arrange(desc(mean_pred_prob)) %>%
  mutate(Rank = row_number())
pitch_types_ranked_LR
## # A tibble: 5 × 5
##   Cluster mean_pred_prob se_pred_prob     n  Rank
##     <dbl>          <dbl>        <dbl> <int> <int>
## 1       3          0.199     0.000484 13072     1
## 2       4          0.186     0.000385 15923     2
## 3       0          0.158     0.000544  8841     3
## 4       2          0.146     0.000381 12011     4
## 5       1          0.119     0.000234 23124     5
# LL
LHP_LHH_by_cluster$Prediction <- predict(GAM_LL, 
                                         newdata=LHP_LHH_by_cluster,
                                         type="response")
pitch_types_ranked_LL <- LHP_LHH_by_cluster %>%
  group_by(Cluster) %>%
  summarise(
    mean_pred_prob = mean(Prediction, na.rm=TRUE),
    se_pred_prob = sd(Prediction, na.rm=TRUE) / sqrt(n()),
    n = n()
  )  %>%
  arrange(desc(mean_pred_prob)) %>%
  mutate(Rank = row_number())
k_zone_height_max <- 3.67
k_zone_height_min <- 1.52
sides <- c(-0.83, 0.83)
zone_width  <- diff(sides)
zone_height <- k_zone_height_max - k_zone_height_min

x_breaks <- seq(sides[1] - zone_width/3,
                sides[2] + zone_width/3,
                length.out = 6)
y_breaks <- seq(k_zone_height_min - zone_height/3,
                k_zone_height_max + zone_height/3,
                length.out = 6)

# Expected value of each pitch over our zone
grid <- expand.grid(
  platelocside   = seq(min(x_breaks), max(x_breaks), length.out = 300),
  platelocheight = seq(min(y_breaks), max(y_breaks), length.out = 300)
)

cluster_means_LL <- LHP_LHH_by_cluster %>%
  group_by(Cluster) %>%
  summarise(across(c(spinrate, relspeed, inducedvertbreak,
                     horzbreak, initposx, initposz), \(x) mean(x, na.rm = TRUE)),
            n = n())

# Integrate expected probabilities
zone_rank_LL <- cluster_means_LL %>%
  mutate(
    expected_prob = map_dbl(Cluster, \(clust) {
      vals <- filter(cluster_means_LL, Cluster == clust)
      
      grid_tmp <- grid
      for (col in names(vals)[names(vals) != c("Cluster", "n")]) {
        grid_tmp[[col]] <- vals[[col]]
      }
      grid_tmp$Cluster <- clust
      
      preds <- predict(GAM_LL, newdata = grid_tmp, type = "response")
      mean(preds, na.rm = TRUE)
    })
  ) %>%
  mutate(
    relative_to_mean = expected_prob / mean(expected_prob, na.rm = TRUE)
  ) %>%
  select(Cluster, n, expected_prob, relative_to_mean) %>%
  arrange(desc(expected_prob))

zone_rank_LL
## # A tibble: 5 × 4
##   Cluster     n expected_prob relative_to_mean
##     <dbl> <int>         <dbl>            <dbl>
## 1       4  7234         0.225            1.32 
## 2       3   984         0.199            1.17 
## 3       2  6687         0.157            0.925
## 4       1  7184         0.144            0.848
## 5       0  4282         0.124            0.732
cluster_means_LR <- LHP_RHH_by_cluster %>%
  group_by(Cluster) %>%
  summarise(across(c(spinrate, relspeed, inducedvertbreak,
                     horzbreak, initposx, initposz), \(x) mean(x, na.rm = TRUE)),
            n = n())

# Integrate expected probabilities
zone_rank_LR <- cluster_means_LR %>%
  mutate(
    expected_prob = map_dbl(Cluster, \(clust) {
      vals <- filter(cluster_means_LR, Cluster == clust)
      
      grid_tmp <- grid
      for (col in names(vals)[names(vals) != c("Cluster", "n")]) {
        grid_tmp[[col]] <- vals[[col]]
      }
      grid_tmp$Cluster <- clust
      
      preds <- predict(GAM_LR, newdata = grid_tmp, type = "response")
      mean(preds, na.rm = TRUE)
    })
  ) %>%
  mutate(
    relative_to_mean = expected_prob / mean(expected_prob, na.rm = TRUE)
  ) %>%
  select(Cluster, n, expected_prob, relative_to_mean) %>%
  arrange(desc(expected_prob))

zone_rank_LR
## # A tibble: 5 × 4
##   Cluster     n expected_prob relative_to_mean
##     <dbl> <int>         <dbl>            <dbl>
## 1       4 15923         0.165            1.19 
## 2       3 13072         0.146            1.06 
## 3       2 12011         0.131            0.950
## 4       1 23124         0.129            0.936
## 5       0  8841         0.118            0.860
cluster_means_RL <- RHP_LHH_by_cluster %>%
  group_by(Cluster) %>%
  summarise(across(c(spinrate, relspeed, inducedvertbreak,
                     horzbreak, initposx, initposz), \(x) mean(x, na.rm = TRUE)),
            n = n())

# Integrate expected probabilities
zone_rank_RL <- cluster_means_RL %>%
  mutate(
    expected_prob = map_dbl(Cluster, \(clust) {
      vals <- filter(cluster_means_RL, Cluster == clust)
      
      grid_tmp <- grid
      for (col in names(vals)[names(vals) != c("Cluster", "n")]) {
        grid_tmp[[col]] <- vals[[col]]
      }
      grid_tmp$Cluster <- clust
      
      preds <- predict(GAM_RL, newdata = grid_tmp, type = "response")
      mean(preds, na.rm = TRUE)
    })
  ) %>%
  mutate(
    relative_to_mean = expected_prob / mean(expected_prob, na.rm = TRUE)
  ) %>%
  select(Cluster, n, expected_prob, relative_to_mean) %>%
  arrange(desc(expected_prob))

zone_rank_RL
## # A tibble: 5 × 4
##   Cluster     n expected_prob relative_to_mean
##     <dbl> <int>         <dbl>            <dbl>
## 1       3 24419         0.153            1.14 
## 2       2 19571         0.135            1.01 
## 3       1 42944         0.132            0.987
## 4       0 20988         0.126            0.945
## 5       4 15520         0.122            0.911
cluster_means_RR <- RHP_RHH_by_cluster %>%
  group_by(Cluster) %>%
  summarise(across(c(spinrate, relspeed, inducedvertbreak,
                     horzbreak, initposx, initposz), \(x) mean(x, na.rm = TRUE)),
            n = n())

# Integrate expected probabilities
zone_rank_RR <- cluster_means_RR %>%
  mutate(
    expected_prob = map_dbl(Cluster, \(clust) {
      vals <- filter(cluster_means_RR, Cluster == clust)
      
      grid_tmp <- grid
      for (col in names(vals)[names(vals) != c("Cluster", "n")]) {
        grid_tmp[[col]] <- vals[[col]]
      }
      grid_tmp$Cluster <- clust
      
      preds <- predict(GAM_RR, newdata = grid_tmp, type = "response")
      mean(preds, na.rm = TRUE)
    })
  ) %>%
  mutate(
    relative_to_mean = expected_prob / mean(expected_prob, na.rm = TRUE)
  ) %>%
  select(Cluster, n, expected_prob, relative_to_mean) %>%
  arrange(desc(expected_prob))

zone_rank_RR
## # A tibble: 5 × 4
##   Cluster     n expected_prob relative_to_mean
##     <dbl> <int>         <dbl>            <dbl>
## 1       3 34464         0.214            1.33 
## 2       2  8113         0.163            1.01 
## 3       0 38163         0.150            0.932
## 4       4 22204         0.140            0.871
## 5       1 41876         0.138            0.856

Pitch Ranking Great Table

library(gt)
pitch_types_left_left <- c("Sinker/2-Seam Fastball",
                           "Splitter/Changeup",
                      "Slider/Cutter",
                      "4-Seam Fastball",
                      "Curveball/Vertical Dropper")

pitch_types_ranked_LL <- zone_rank_LL %>%
  mutate(
    Pitch_Type = pitch_types_left_left,
    Rank = row_number()
  )


# Create the gt table
gt_LL <- pitch_types_ranked_LL %>%
  gt() %>%
  cols_move_to_start(columns = c(Rank, Pitch_Type, Cluster,
                                 expected_prob, relative_to_mean)) %>%
  fmt_number(columns = c(expected_prob, relative_to_mean), decimals = 3) %>%
  cols_label(
    Cluster = "Pitch Cluster",
    Pitch_Type = "Pitch Profile",
    expected_prob = "Expected Probability",
    n = "Sample Size",
    Rank = "Rank",
    relative_to_mean = "Relative to Average"
  ) %>%
  tab_header(
    title = "RExpected Optimal GIDP Contact Probabilities by Pitch Profil",
    subtitle = "LHP vs. LHH Matchup"
  ) %>%
  fmt_missing(everything(), missing_text = "—") %>%
  data_color(
    columns = c(Rank),
    colors = scales::col_numeric(
      palette = c("#b2182b", "#ef8a62", "#f7f7f7", "#67a9cf", "#2166ac"),
      domain = range(
        pitch_types_ranked_LL$Rank
      )
    )
  ) %>%
  data_color(
    columns = c(relative_to_mean),
    colors = scales::col_numeric(
      palette = rev(c("#b2182b", "#ef8a62", "#f7f7f7", "#67a9cf", "#2166ac")),
      domain = range(
        pitch_types_ranked_LL$relative_to_mean
      )
    )
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(columns = c(Rank, expected_prob, relative_to_mean))
  ) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold", size = px(18))
    ),
    locations = cells_title(groups = "title")
  ) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold", size = px(14))
    ),
    locations = cells_title(groups = "subtitle")
  ) %>%
  tab_options(
    table.font.size = 13,
    data_row.padding = px(5)
  )
## Warning: Since gt v0.6.0 `fmt_missing()` is deprecated and will soon be removed.
## ℹ Use `sub_missing()` instead.
## This warning is displayed once every 8 hours.
## Warning: Since gt v0.9.0, the `colors` argument has been deprecated.
## • Please use the `fn` argument instead.
## This warning is displayed once every 8 hours.
gt_LL
RExpected Optimal GIDP Contact Probabilities by Pitch Profil
LHP vs. LHH Matchup
Rank Pitch Profile Pitch Cluster Expected Probability Relative to Average Sample Size
1 Sinker/2-Seam Fastball 4 0.225 1.322 7234
2 Splitter/Changeup 3 0.199 1.173 984
3 Slider/Cutter 2 0.157 0.925 6687
4 4-Seam Fastball 1 0.144 0.848 7184
5 Curveball/Vertical Dropper 0 0.124 0.732 4282
pitch_types_left_right <- c("Sinker/2-Seam Fastball",
                           "Splitter/Changeup",
                      "Slider/Cutter",
                      "Curveball/Vertical Dropper",
                      "4-Seam Fastball")

pitch_types_ranked_LR <- zone_rank_LR %>%
  mutate(
    Pitch_Type = pitch_types_left_right,
    Rank = row_number()
  )


# Create the gt table
gt_LR <- pitch_types_ranked_LR %>%
  gt() %>%
  cols_move_to_start(columns = c(Rank, Pitch_Type, Cluster,
                                 expected_prob, relative_to_mean)) %>%
  fmt_number(columns = c(expected_prob, relative_to_mean), decimals = 3) %>%
  cols_label(
    Cluster = "Pitch Cluster",
    Pitch_Type = "Pitch Profile",
    expected_prob = "Expected Probability",
    n = "Sample Size",
    Rank = "Rank",
    relative_to_mean = "Relative to Average"
  ) %>%
  tab_header(
    title = "Expected Optimal GIDP Contact Probabilities by Pitch Profil",
    subtitle = "LHP vs. RHH Matchup"
  ) %>%
  fmt_missing(everything(), missing_text = "—") %>%
  data_color(
    columns = c(Rank),
    colors = scales::col_numeric(
      palette = c("#b2182b", "#ef8a62", "#f7f7f7", "#67a9cf", "#2166ac"),
      domain = range(
        pitch_types_ranked_LR$Rank
      )
    )
  ) %>%
  data_color(
    columns = c(relative_to_mean),
    colors = scales::col_numeric(
      palette = rev(c("#b2182b", "#ef8a62", "#f7f7f7", "#67a9cf", "#2166ac")),
      domain = range(
        pitch_types_ranked_LR$relative_to_mean
      )
    )
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(columns = c(Rank, expected_prob, relative_to_mean))
  ) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold", size = px(18))
    ),
    locations = cells_title(groups = "title")
  ) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold", size = px(14))
    ),
    locations = cells_title(groups = "subtitle")
  ) %>%
  tab_options(
    table.font.size = 13,
    data_row.padding = px(5)
  )

gt_LR
Expected Optimal GIDP Contact Probabilities by Pitch Profil
LHP vs. RHH Matchup
Rank Pitch Profile Pitch Cluster Expected Probability Relative to Average Sample Size
1 Sinker/2-Seam Fastball 4 0.165 1.194 15923
2 Splitter/Changeup 3 0.146 1.061 13072
3 Slider/Cutter 2 0.131 0.950 12011
4 Curveball/Vertical Dropper 1 0.129 0.936 23124
5 4-Seam Fastball 0 0.118 0.860 8841
pitch_types_right_left <- c("Sinker/2-Seam Fastball",
                            "Splitter/Changeup",
                            "4-Seam Fastball",
                             "Slider/Cutter",
                      "Curveball/Vertical Dropper")

pitch_types_ranked_RL <- zone_rank_RL %>%
  mutate(
    Pitch_Type = pitch_types_right_left,
    Rank = row_number()
  )


# Create the gt table
gt_RL <- pitch_types_ranked_RL %>%
  gt() %>%
  cols_move_to_start(columns = c(Rank, Pitch_Type, Cluster,
                                 expected_prob, relative_to_mean)) %>%
  fmt_number(columns = c(expected_prob, relative_to_mean), decimals = 3) %>%
  cols_label(
    Cluster = "Pitch Cluster",
    Pitch_Type = "Pitch Profile",
    expected_prob = "Expected Probability",
    n = "Sample Size",
    Rank = "Rank",
    relative_to_mean = "Relative to Average"
  ) %>%
  tab_header(
    title = "Expected Optimal GIDP Contact Probabilities by Pitch Profil",
    subtitle = "RHP vs. LHH Matchup"
  ) %>%
  fmt_missing(everything(), missing_text = "—") %>%
  data_color(
    columns = c(Rank),
    colors = scales::col_numeric(
      palette = c("#b2182b", "#ef8a62", "#f7f7f7", "#67a9cf", "#2166ac"),
      domain = range(
        pitch_types_ranked_RL$Rank
      )
    )
  ) %>%
  data_color(
    columns = c(relative_to_mean),
    colors = scales::col_numeric(
      palette = rev(c("#b2182b", "#ef8a62", "#f7f7f7", "#67a9cf", "#2166ac")),
      domain = range(
        pitch_types_ranked_RL$relative_to_mean
      )
    )
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(columns = c(Rank, expected_prob, relative_to_mean))
  ) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold", size = px(18))
    ),
    locations = cells_title(groups = "title")
  ) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold", size = px(14))
    ),
    locations = cells_title(groups = "subtitle")
  ) %>%
  tab_options(
    table.font.size = 13,
    data_row.padding = px(5)
  )

gt_RL
Expected Optimal GIDP Contact Probabilities by Pitch Profil
RHP vs. LHH Matchup
Rank Pitch Profile Pitch Cluster Expected Probability Relative to Average Sample Size
1 Sinker/2-Seam Fastball 3 0.153 1.144 24419
2 Splitter/Changeup 2 0.135 1.013 19571
3 4-Seam Fastball 1 0.132 0.987 42944
4 Slider/Cutter 0 0.126 0.945 20988
5 Curveball/Vertical Dropper 4 0.122 0.911 15520
pitch_types_right_right <- c("Sinker/2-Seam Fastball",
                             "Splitter/Changeup",
                             "Slider/Cutter",
                      "Curveball/Vertical Dropper",
                      "4-Seam Fastball")

pitch_types_ranked_RR <- zone_rank_RR %>%
  mutate(
    Pitch_Type = pitch_types_right_right,
    Rank = row_number()
  )


# Create the gt table
gt_RR <- pitch_types_ranked_RR %>%
  gt() %>%
  cols_move_to_start(columns = c(Rank, Pitch_Type, Cluster,
                                 expected_prob, relative_to_mean)) %>%
  fmt_number(columns = c(expected_prob, relative_to_mean), decimals = 3) %>%
  cols_label(
    Cluster = "Pitch Cluster",
    Pitch_Type = "Pitch Profile",
    expected_prob = "Expected Probability",
    n = "Sample Size",
    Rank = "Rank",
    relative_to_mean = "Relative to Average"
  ) %>%
  tab_header(
    title = "Expected Optimal GIDP Contact Probabilities by Pitch Profile",
    subtitle = "RHP vs. RHH Matchup"
  ) %>%
  fmt_missing(everything(), missing_text = "—") %>%
  data_color(
    columns = c(Rank),
    colors = scales::col_numeric(
      palette = c("#b2182b", "#ef8a62", "#f7f7f7", "#67a9cf", "#2166ac"),
      domain = range(
        pitch_types_ranked_RR$Rank
      )
    )
  ) %>%
  data_color(
    columns = c(relative_to_mean),
    colors = scales::col_numeric(
      palette = rev(c("#b2182b", "#ef8a62", "#f7f7f7", "#67a9cf", "#2166ac")),
      domain = range(
        pitch_types_ranked_RR$relative_to_mean
      )
    )
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(columns = c(Rank, expected_prob, relative_to_mean))
  ) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold", size = px(18))
    ),
    locations = cells_title(groups = "title")
  ) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold", size = px(14))
    ),
    locations = cells_title(groups = "subtitle")
  ) %>%
  tab_options(
    table.font.size = 13,
    data_row.padding = px(5)
  )

gt_RR
Expected Optimal GIDP Contact Probabilities by Pitch Profile
RHP vs. RHH Matchup
Rank Pitch Profile Pitch Cluster Expected Probability Relative to Average Sample Size
1 Sinker/2-Seam Fastball 3 0.214 1.330 34464
2 Splitter/Changeup 2 0.163 1.011 8113
3 Slider/Cutter 0 0.150 0.932 38163
4 Curveball/Vertical Dropper 4 0.140 0.871 22204
5 4-Seam Fastball 1 0.138 0.856 41876
library(gratia)
smooth_eff_RR <- gratia::smooth_estimates(GAM_RR)

feature_rank_RR <- smooth_eff_RR %>%
  group_by(.smooth) %>%
  summarise(
    effect_range = max(.estimate) - min(.estimate),
    effect_sd = sd(.estimate)
  ) %>%
  arrange(desc(effect_range))

feature_rank_RR
## # A tibble: 10 × 3
##    .smooth                         effect_range effect_sd
##    <chr>                                  <dbl>     <dbl>
##  1 ti(platelocside,platelocheight)       6.55      0.947 
##  2 s(platelocside)                       3.09      0.838 
##  3 s(relspeed)                           2.17      0.697 
##  4 s(inducedvertbreak)                   2.11      0.584 
##  5 s(platelocheight)                     1.91      0.596 
##  6 ti(initposx,initposz)                 1.69      0.198 
##  7 s(horzbreak)                          0.940     0.277 
##  8 s(initposz)                           0.780     0.229 
##  9 s(spinrate)                           0.301     0.0881
## 10 s(initposx)                           0.0644    0.0192
smooth_eff_RL <- gratia::smooth_estimates(GAM_RL)

feature_rank_RL <- smooth_eff_RL %>%
  group_by(.smooth) %>%
  summarise(
    effect_range = max(.estimate) - min(.estimate),
    effect_sd = sd(.estimate)
  ) %>%
  arrange(desc(effect_range))

feature_rank_RL
## # A tibble: 10 × 3
##    .smooth                         effect_range effect_sd
##    <chr>                                  <dbl>     <dbl>
##  1 s(inducedvertbreak)                    2.65     0.710 
##  2 s(platelocside)                        2.46     0.730 
##  3 s(relspeed)                            1.98     0.662 
##  4 ti(platelocside,platelocheight)        1.73     0.251 
##  5 s(platelocheight)                      1.37     0.425 
##  6 s(horzbreak)                           0.780    0.204 
##  7 s(initposx)                            0.678    0.203 
##  8 ti(initposx,initposz)                  0.451    0.0765
##  9 s(spinrate)                            0.282    0.0914
## 10 s(initposz)                            0.218    0.0599
smooth_eff_LR <- gratia::smooth_estimates(GAM_LR)

feature_rank_LR <- smooth_eff_LR %>%
  group_by(.smooth) %>%
  summarise(
    effect_range = max(.estimate) - min(.estimate),
    effect_sd = sd(.estimate)
  ) %>%
  arrange(desc(effect_range))

feature_rank_LR
## # A tibble: 10 × 3
##    .smooth                         effect_range effect_sd
##    <chr>                                  <dbl>     <dbl>
##  1 ti(platelocside,platelocheight)       26.7      2.80  
##  2 s(platelocside)                        4.84     1.51  
##  3 s(inducedvertbreak)                    3.30     0.884 
##  4 s(horzbreak)                           3.25     0.848 
##  5 s(platelocheight)                      1.75     0.513 
##  6 s(relspeed)                            1.57     0.539 
##  7 ti(initposx,initposz)                  0.672    0.0803
##  8 s(spinrate)                            0.639    0.214 
##  9 s(initposz)                            0.255    0.0871
## 10 s(initposx)                            0.248    0.0727
smooth_eff_LL <- gratia::smooth_estimates(GAM_LL)

feature_rank_LL <- smooth_eff_LL %>%
  group_by(.smooth) %>%
  summarise(
    effect_range = max(.estimate) - min(.estimate),
    effect_sd = sd(.estimate)
  ) %>%
  arrange(desc(effect_range))

feature_rank_LL
## # A tibble: 10 × 3
##    .smooth                         effect_range effect_sd
##    <chr>                                  <dbl>     <dbl>
##  1 ti(platelocside,platelocheight)       15.0      1.84  
##  2 s(inducedvertbreak)                    8.46     2.72  
##  3 s(platelocheight)                      2.15     0.629 
##  4 s(platelocside)                        1.75     0.483 
##  5 s(relspeed)                            1.67     0.518 
##  6 s(initposz)                            0.887    0.260 
##  7 s(horzbreak)                           0.829    0.289 
##  8 s(spinrate)                            0.309    0.0905
##  9 s(initposx)                            0.165    0.0482
## 10 ti(initposx,initposz)                  0.159    0.0235
plot(GAM_RR,
     scale = 0,         # use same y-scale across plots (better for comparison)
     ylim = c(-3, 3),   # fix y-axis limits
     rug = TRUE,        # show tick marks for data density
     col = "navy",
     ylab = "Partial Effect on GIDP Probability"
)

plot(GAM_RL)

plot(GAM_LR)

plot(GAM_LL)